suppressMessages(library(scater))
suppressMessages(library(ggplot2))
suppressMessages(library(ggpubr)) 
yg0 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg0.rds")
yg1 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg1.rds")
yg3 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg3.rds")
yg5 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_yg5.rds")

old0 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old0.rds")
old1 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old1.rds")
old3 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old3.rds")
old5 <- readRDS("/mnt/nmorais-nfs/marta/pB_joana/pB_data/sce_old5.rds")
yg0_mat <- counts(yg0)
yg1_mat <- counts(yg1)
yg3_mat <- counts(yg3)
yg5_mat <- counts(yg5)

old0_mat <- counts(old0)
old1_mat <- counts(old1)
old3_mat <- counts(old3)
old5_mat <- counts(old5)
main_matrix <- cbind(as.matrix(yg0_mat), 
                     as.matrix(yg1_mat), 
                     as.matrix(yg3_mat),
                     as.matrix(yg5_mat),
                     as.matrix(old0_mat),
                     as.matrix(old1_mat),
                     as.matrix(old3_mat),
                     as.matrix(old5_mat))
keep_genes <- rowSums(main_matrix) > 0
table(keep_genes)
## keep_genes
## FALSE  TRUE 
##  5146 27139
main_matrix <- main_matrix[keep_genes,]
dim(main_matrix)
## [1] 27139 65006
sample <- c(rep("YG0",7505),
            rep("YG1", 5724),
            rep("YG3", 7284),
            rep("YG5", 6111),
            rep("OLD0", 8237),
            rep("OLD1", 7596),
            rep("OLD3", 5647),
            rep("OLD5", 16902)
  
)
table(sample)
## sample
##  OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5 
##  8237  7596  5647 16902  7505  5724  7284  6111
sorting_day <- c(rep("day1",7505),
            rep("day2", 5724),
            rep("day1", 7284),
            rep("day2", 6111),
            rep("day1", 8237),
            rep("day2", 7596),
            rep("day1", 5647),
            rep("day2", 16902)
  
)
table(sorting_day)
## sorting_day
##  day1  day2 
## 28673 36333
table(sorting_day, sample)
##            sample
## sorting_day  OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5
##        day1  8237     0  5647     0  7505     0  7284     0
##        day2     0  7596     0 16902     0  5724     0  6111
lane <- c(rep("lane1",7505),
            rep("lane1", 5724),
            rep("lane2", 7284),
            rep("lane2", 6111),
            rep("lane1", 8237),
            rep("lane1", 7596),
            rep("lane2", 5647),
            rep("lane2", 16902)
  
)
table(lane)
## lane
## lane1 lane2 
## 29062 35944
table(lane, sample)
##        sample
## lane     OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5
##   lane1  8237  7596     0     0  7505  5724     0     0
##   lane2     0     0  5647 16902     0     0  7284  6111
total_counts <- c(yg0$total_counts, 
                  yg1$total_counts, 
                  yg3$total_counts, 
                  yg5$total_counts,
                  old0$total_counts, 
                  old1$total_counts, 
                  old3$total_counts, 
                  old5$total_counts)
total_features <- c(yg0$total_features, 
                    yg1$total_features, 
                    yg3$total_features, 
                    yg5$total_features,
                    old0$total_features, 
                    old1$total_features, 
                    old3$total_features, 
                    old5$total_features)
pct_mt <- c(yg0$percentage_mt, 
            yg1$percentage_mt, 
            yg3$percentage_mt, 
            yg5$percentage_mt,
            old0$percentage_mt, 
            old1$percentage_mt, 
            old3$percentage_mt, 
            old5$percentage_mt)
df_colData <- data.frame(total_counts <- total_counts,
                         total_features <- total_features,
                         pct_mt <- pct_mt,
                         log10_total_counts <- log10(total_counts+1),
                         log10_total_features <- log10(total_features +1),
                         log10_percentage_mt <- log10(pct_mt + 0.01))
colnames(df_colData) <- c("total_counts",
                          "total_features",
                          "percentage_mt",
                          "log10_total_counts",
                          "log10_total_features",
                          "log10_percentage_mt")
df_colData[1:5,]
summary(df_colData$total_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    2238    8042    9017   13408   74630
quantile(df_colData$total_counts, c(0.05,0.95))
##       5%      95% 
##   746.00 22724.75
summary(df_colData$total_features)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      57    1092    2537    2424    3445    7963
quantile(df_colData$total_features, c(0.05,0.95))
##   5%  95% 
##  441 4666
summary(df_colData$percentage_mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.315   1.970   3.527   2.811  97.670
quantile(df_colData$percentage_mt, c(0.05,0.95))
##        5%       95% 
## 0.3830922 8.8435374
summary(df_colData$log10_total_counts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.700   3.350   3.905   3.752   4.127   4.873
quantile(df_colData$log10_total_counts, c(0.05,0.95))
##       5%      95% 
## 2.873321 4.356518
summary(df_colData$log10_total_features)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.763   3.039   3.404   3.282   3.537   3.901
quantile(df_colData$log10_total_features, c(0.05,0.95))
##       5%      95% 
## 2.645422 3.669038
summary(df_colData$log10_percentage_mt)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0000  0.1222  0.2968  0.2816  0.4505  1.9898
quantile(df_colData$log10_percentage_mt, c(0.05,0.95))
##         5%        95% 
## -0.4055058  0.9471168
gghistogram(data=df_colData, 
            x = "total_counts",
            y = "..count..",
            bins=(93),
                 alpha = .2,
                 color="black",
                 fill="red",
            title="Library size",
            ylab="#cells",
            xlab="Total counts"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=40))

gghistogram(data=df_colData, 
            x = "log10_total_counts",
            y = "..count..",
            bins=(94),
                 alpha = .2,
                 color="black",
                 fill="white",
            title="Library size",
            ylab="#cells",
            xlab="log10(total counts)"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=20)) + geom_vline(xintercept = 3.5, size = 1, col = "red") + 
scale_x_continuous(breaks = seq(3, 4.5, by=0.5))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print("h")
## [1] "h"
table(df_colData$log10_total_counts >= 3.5)
## 
## FALSE  TRUE 
## 19850 45156
gghistogram(data=df_colData, 
            x = "total_features",
            y = "..count..",
            bins=(94),
                 alpha = .2,
                 color="black",
                 fill="red",
            title="Total features detected",
            ylab="#cells",
            xlab="Total features"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=40))

gghistogram(data=df_colData, 
            x = "log10_total_features",
            y = "..count..",
            bins=(94),
                 alpha = .2,
                 color="black",
                 fill="white",
            title="Number of detected features",
            ylab="#cells",
            xlab="log10(total features)"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=20)) + geom_vline(xintercept = 3, col = "red", size = 1)

table(df_colData$log10_total_counts >= 3.2)
## 
## FALSE  TRUE 
## 12331 52675
ggplot(df_colData, 
       aes(y=total_features, x=total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("Total counts") + ylab("Unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) 

ggplot(df_colData, 
       aes(y=total_features, x=total_counts, color = sample)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("Total counts") + ylab("Unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  scale_color_manual(values=c("blue","green", "grey", "red", "orange",
                                "yellow", "pink", "cyan"))

ggplot(df_colData, 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) 

ggplot(df_colData, 
       aes(y=log10_total_features, x=log10_total_counts, color = sample)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  scale_color_manual(values=c("blue","green", "grey", "red", "orange",
                                "yellow", "pink", "cyan"))

ggplot(df_colData, 
       aes(y=percentage_mt, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("log10(total counts)") + ylab("% mt-genes")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
   geom_hline(yintercept = 10, col = "red")

ggplot(df_colData, 
       aes(y=percentage_mt, x=log10_total_counts, 
           color = log10(main_matrix["S100a8",]+1)
           ))+
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Coloured by S100a8 expression") +
  xlab("log10 Total counts") + ylab("Percentage mt-genes")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  ylim(0,25)  +
  scale_colour_gradient(low = "grey", high = "red", na.value = NA)
## Warning: Removed 1511 rows containing missing values (`geom_point()`).

ggplot(df_colData, 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("") +
  xlab("log10(total counts)") + ylab("log10(total features)")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5, col = "red") + geom_hline(yintercept = 3, col = "red")

ggplot(df_colData[sample == "YG0",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - YG0") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "YG1",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - YG1") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "YG3",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - YG3") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "YG5",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - YG5") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "OLD0",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - OLD0") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "OLD1",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - OLD1") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "OLD3",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - OLD3") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

ggplot(df_colData[sample == "OLD5",], 
       aes(y=log10_total_features, x=log10_total_counts, color = percentage_mt)) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("Cutoff suggestion - OLD5") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  geom_vline(xintercept = 3.5) + geom_hline(yintercept = 3)

Keep cells with total counts >= 3.0 and total features >= 2.5

keep_cells <- df_colData$log10_total_counts >= 3.5 & df_colData$log10_total_features >= 3 &
              df_colData$percentage_mt < 10
table(keep_cells)
## keep_cells
## FALSE  TRUE 
## 20158 44848
gghistogram(data=df_colData[keep_cells,], 
            x = "log10_total_counts",
            y = "..count..",
            bins=(94),
                 alpha = .2,
                 color="black",
                 fill="white",
            title="Library size - after QC",
            ylab="#cells",
            xlab="log10(total counts)"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=20)) 

gghistogram(data=df_colData[keep_cells,], 
            x = "log10_total_features",
            y = "..count..",
            bins=(94),
                 alpha = .2,
                 color="black",
                 fill="white",
            title="Number of detected features - after QC",
            ylab="#cells",
            xlab="log10(total features)"
                 ) + 
  theme_bw() +
  theme(text = element_text(size=20)) 

ggplot(df_colData[keep_cells,], 
       aes(y=log10_total_features, x=log10_total_counts, 
           color = (log10(main_matrix["S100a8",]+1)[keep_cells]))) +
  geom_jitter(size=1, alpha=0.75) +
  ggtitle("After QC - coloured by S100a8 expression") +
  xlab("log10 Total counts") + ylab("log10 unique features detected")+
  theme_bw() +
  theme(text = element_text(size=30), legend.title=element_blank()) +
  scale_colour_gradient(low = "grey", high = "red", na.value = NA)

table(keep_cells, sample)
##           sample
## keep_cells  OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5
##      FALSE  1328  1425   585 12610  1436   946  1186   642
##      TRUE   6909  6171  5062  4292  6069  4778  6098  5469
table(sample)
## sample
##  OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5 
##  8237  7596  5647 16902  7505  5724  7284  6111
pct_excluded <- as.matrix(table(keep_cells, sample))
pct_excluded
##           sample
## keep_cells  OLD0  OLD1  OLD3  OLD5   YG0   YG1   YG3   YG5
##      FALSE  1328  1425   585 12610  1436   946  1186   642
##      TRUE   6909  6171  5062  4292  6069  4778  6098  5469
pct_excluded[1,]/as.vector(table(sample))*100
##     OLD0     OLD1     OLD3     OLD5      YG0      YG1      YG3      YG5 
## 16.12237 18.75987 10.35948 74.60656 19.13391 16.52690 16.28226 10.50565
pct_excluded[2,]/as.vector(table(sample))*100
##     OLD0     OLD1     OLD3     OLD5      YG0      YG1      YG3      YG5 
## 83.87763 81.24013 89.64052 25.39344 80.86609 83.47310 83.71774 89.49435
main_matrix <- main_matrix[,keep_cells]
dim(main_matrix)
## [1] 27139 44848
sce <- SingleCellExperiment(assays = list(counts = as.matrix(main_matrix)))
rowData(sce)$feature_ids <- yg1$feature.ids[keep_genes]
genes_to_keep <- apply(
  counts(sce),
  1,
  function(x) length(x[x > 1]) >= 1 ) 
table(genes_to_keep)
## genes_to_keep
## FALSE  TRUE 
##  5796 21343
sce <- sce[genes_to_keep,]
sce
## class: SingleCellExperiment 
## dim: 21343 44848 
## metadata(0):
## assays(1): counts
## rownames(21343): Xkr4 Rp1 ... CAAA01147332.1 AC149090.1
## rowData names(0):
## colnames(44848): AAACCCAAGCCTTTCC-1 AAACCCAAGTCTTCCC-1 ...
##   TTTGTTGTCGCTTACC-1 TTTGTTGTCTCTAAGG-1
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
grep("mt-", rownames(sce), value = TRUE)
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
mt_genes <- grep("mt-", rownames(sce), value = TRUE)
QC_metrics <- perCellQCMetrics(sce, exprs_values="counts", subsets=list(MT = mt_genes))
QC_metrics[1:5,]
## DataFrame with 5 rows and 6 columns
##                          sum  detected subsets_MT_sum subsets_MT_detected
##                    <numeric> <numeric>      <numeric>           <numeric>
## AAACCCAAGCCTTTCC-1      9103      2857            323                  13
## AAACCCAAGTCTTCCC-1     17564      4075            329                  12
## AAACCCACAGACACAG-1     12299      3433            363                  12
## AAACCCAGTACCTATG-1      7512      2515            161                  11
## AAACCCAGTCACTTCC-1     34858      5044            790                  13
##                    subsets_MT_percent     total
##                             <numeric> <numeric>
## AAACCCAAGCCTTTCC-1            3.54828      9103
## AAACCCAAGTCTTCCC-1            1.87315     17564
## AAACCCACAGACACAG-1            2.95146     12299
## AAACCCAGTACCTATG-1            2.14324      7512
## AAACCCAGTCACTTCC-1            2.26634     34858
sce$total_counts <- QC_metrics$sum
sce$log10_total_counts <- log10(QC_metrics$sum+1)
sce$log10_total_features <- log10(QC_metrics$detected+1)
sce$total_features <- QC_metrics$detected
sce$pct_mt <- QC_metrics$subsets_MT_percent
sce$log10_pct_mt <- log10(QC_metrics$subsets_MT_percent+0.01)
sce$sample <- sample[keep_cells]
sce$sorting_day <- sorting_day[keep_cells]
sce$lane <- lane[keep_cells] 
sce
## class: SingleCellExperiment 
## dim: 21343 44848 
## metadata(0):
## assays(1): counts
## rownames(21343): Xkr4 Rp1 ... CAAA01147332.1 AC149090.1
## rowData names(0):
## colnames(44848): AAACCCAAGCCTTTCC-1 AAACCCAAGTCTTCCC-1 ...
##   TTTGTTGTCGCTTACC-1 TTTGTTGTCTCTAAGG-1
## colData names(9): total_counts log10_total_counts ... sorting_day lane
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
#saveRDS(sce, "/mnt/nmorais-nfs/marta/pB_joana-data/pC_data/sce_all-samples-after-qc.rds")